load packages

library(tidyverse)
library(glmnet)
library(caTools)
library(caret)
library(ROCR)

load coded data

# Manually coded artifact labels: one row per subject/run/volume, with the
# hand-coded `trash` flag exposed as `artifact`.
coded <- read.csv("~/Documents/code/dsnlab/automotion-test-set/FP_manuallyCoded_edited2.csv") %>%
  select(subjectID, run, volume, artifact = trash)

load stripe detection data

fileDir = "~/Documents/code/dsnlab/automotion-test-set/output/FP/"
filePattern = "FP_stripes_.*.csv"

file_list = list.files(fileDir, pattern = filePattern)

# Read every stripe detection file and stack the results.
# Mapping over the file list and binding once replaces the previous
# exists("stripes")/rbind-in-a-loop pattern, which (a) copied the
# accumulated data frame on every iteration (O(n^2)) and (b) silently
# appended to a stale `stripes` left in the environment by a prior run.
stripes = file_list %>%
  lapply(function(f) {
    read.csv(paste0(fileDir, f)) %>%
      rename("volume" = t,
             "subjectID" = subject) %>%
      select(-file)
  }) %>%
  bind_rows()

load global intensities and rps

# define paths and variables
# Paths and settings for the realignment-parameter (rp) files
rpDir <- "~/Documents/code/dsnlab/automotion-test-set/FP_rp_txt/"
# outputDir = '~/Documents/code/tds_auto-motion/auto-motion-output/'
# plotDir = '~/Documents/code/tds_auto-motion/auto-motion-output/plots/'
study <- "FP"
# filename pattern; the two capture groups feed tidyr::extract() below
rpPattern <- "^rp_(FP[0-9]{3})_(.*).txt"
rpCols <- c("euclidian_trans","euclidian_rot","euclidian_trans_deriv","euclidian_rot_deriv","trash.rp")

# global intensity time series (volume-wise means/SDs) for this study
intensities <- read.csv(paste0("~/Documents/code/dsnlab/automotion-test-set/", study, "_globalIntensities.csv"))

# rp files
# rp files
file_list = list.files(rpDir, pattern = rpPattern)

# Read each realignment-parameter file, tag it with its filename, and stack.
# Fixes two issues in the previous exists("rp")/rbind accumulation loop:
#   * rep(file, count(temp)) passed a 1x1 data frame (dplyr::count returns a
#     data frame, not a row count) as rep()'s `times` argument; data.frame()
#     recycles a length-1 `file` value to nrow(temp) rows by itself.
#   * rbind() inside the loop re-copied the accumulated frame each iteration.
rp = file_list %>%
  lapply(function(f) {
    temp = read.table(paste0(rpDir, f))
    colnames(temp) = rpCols
    data.frame(temp, file = f) %>%
      # volume index = row order within each rp file
      mutate(volume = row_number()) %>%
      # pull subjectID and run out of the filename via the capture groups
      extract(file, c("subjectID", "run"), rpPattern)
  }) %>%
  bind_rows()

join dataframes

# Merge stripe metrics, manual artifact codes, global intensities, and
# realignment parameters on subject/run/volume, compute volume-to-volume
# intensity changes, and widen the stripe frequency tiles into columns.
joined = left_join(stripes, coded, by = c("subjectID", "run", "volume")) %>%
  left_join(., intensities, by = c("subjectID", "run", "volume")) %>%
  left_join(., rp, by = c("subjectID", "run", "volume")) %>%
  # prefix so spread() below produces valid column names (tile_1, tile_2, ...)
  mutate(tile = paste0("tile_",tile)) %>%
  group_by(subjectID, run, tile) %>%
  # lagged differences of the global mean/SD; first volume per group is NA
  # (zero-filled later in the ML tidying step)
  mutate(Diff.mean = volMean - lag(volMean),
         Diff.sd = volSD - lag(volSD)) %>%
  # NOTE(review): the result is still grouped by subjectID/run/tile after this
  # chain; downstream code regroups, but an explicit ungroup() would be safer
  spread(tile, freqtile_power)

split the data

# Split 75/25 into training and test sets, stratified on the artifact label.
# (Named `split_idx` rather than `sample` so base::sample is not shadowed.)
set.seed(101)
split_idx <- sample.split(joined$artifact, SplitRatio = 0.75)
training <- subset(joined, split_idx)
testing <- subset(joined, !split_idx)

machine learning

tidy training and testing data

# The training and test sets received the same 13-step treatment, duplicated
# verbatim; do it once in a helper. Zero-fills the first-volume NA
# differences, mean-centers each frequency tile's power within
# subject/run/tile, drops predictors the models do not use, and orders
# columns as id columns + artifact + predictors.
tidy_ml = function(data) {
  data %>%
    group_by(subjectID, run) %>%
    mutate(Diff.mean = ifelse(is.na(Diff.mean),0,Diff.mean),
           Diff.sd = ifelse(is.na(Diff.sd),0,Diff.sd)) %>%
    gather(tile,freqtile_power, starts_with("tile")) %>%
    # "_c" marks the centered version of each tile column
    mutate(tile = paste0(tile,"_c")) %>%
    group_by(subjectID, run, tile) %>%
    mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
    ungroup() %>%
    select(-freqtile_power) %>%
    spread(tile,freqtile_power_c) %>%
    select(-trash.rp, -volMean, -volSD, -euclidian_rot, -euclidian_trans) %>%
    select(subjectID, run, volume, artifact, everything())
}

train.ml = tidy_ml(training)
test.ml = tidy_ml(testing)

use lasso logistic regression to fit beta weights for each predictor

training

# subset predictors and criterion
# columns 1:4 are subjectID, run, volume, artifact; everything after is a predictor
x_train = as.matrix(train.ml[,-c(1,2,3,4)])
y_train = as.double(as.matrix(train.ml[, 4]))

# run xval to determine lambda
# lasso (alpha = 1) logistic regression, cross-validated on AUC
# NOTE(review): parallel=TRUE only takes effect if a foreach backend (e.g.
# doParallel) has been registered; none is visible in this script -- confirm
cv.train <- cv.glmnet(x_train, y_train, family='binomial', alpha=1, parallel=TRUE, standardize=TRUE, type.measure='auc')

# cross-validated AUC as a function of lambda
plot(cv.train)

# coefficient paths across the lambda sequence
plot(cv.train$glmnet.fit, xvar="lambda", label=TRUE)

# lambda minimizing CV error, and the largest lambda within 1 SE of it
cv.train$lambda.min
## [1] 0.001160478
cv.train$lambda.1se
## [1] 0.00564294
# coefficients retained at each lambda ("." = shrunk to exactly zero)
coef(cv.train, s=cv.train$lambda.min)
## 16 x 1 sparse Matrix of class "dgCMatrix"
##                                    1
## (Intercept)              -4.52394327
## euclidian_trans_deriv     .         
## euclidian_rot_deriv       1.32222698
## Diff.mean                 0.01993830
## Diff.sd                  -0.03976028
## tile_1_c                -33.99427335
## tile_10_c              5652.90255527
## tile_11_c                 .         
## tile_2_c                -10.78123078
## tile_3_c              -1156.19857225
## tile_4_c                  .         
## tile_5_c                 61.23874115
## tile_6_c               -431.10703537
## tile_7_c                  .         
## tile_8_c               3396.60610294
## tile_9_c                  .
coef(cv.train, s=cv.train$lambda.1se)
## 16 x 1 sparse Matrix of class "dgCMatrix"
##                                    1
## (Intercept)             -4.130248103
## euclidian_trans_deriv    .          
## euclidian_rot_deriv      .          
## Diff.mean                0.008482909
## Diff.sd                 -0.012461183
## tile_1_c               -25.707434181
## tile_10_c             3004.758521999
## tile_11_c                .          
## tile_2_c                 .          
## tile_3_c                 .          
## tile_4_c                 .          
## tile_5_c                 .          
## tile_6_c                 .          
## tile_7_c                 .          
## tile_8_c              1860.356734575
## tile_9_c                 .
# test on sample
# predicted artifact probabilities for the training set at lambda.1se
pred_train = predict(cv.train, newx = x_train, s=cv.train$lambda.1se, type="response")

# plot cutoff v. accuracy
predicted = prediction(pred_train, y_train, label.ordering = NULL)
perf = performance(predicted, measure = "acc")
perf.df = data.frame(cut=perf@x.values[[1]],acc=perf@y.values[[1]])

ggplot(perf.df, aes(cut, acc)) +
  geom_line()

# plot false v. true positive rate
perf = performance(predicted, measure = "tpr", x.measure = "fpr")
perf.df = data.frame(cut=perf@alpha.values[[1]],fpr=perf@x.values[[1]],tpr=perf@y.values[[1]])

ggplot(perf.df, aes(fpr, tpr)) +
  geom_line()

# plot specificity v. sensitivity
perf = performance(predicted, measure = "sens", x.measure = "spec")
perf.df = data.frame(cut=perf@alpha.values[[1]],sens=perf@x.values[[1]],spec=perf@y.values[[1]])
ggplot(perf.df, aes(spec, sens)) +
  geom_line()

# sensitivity and specificity against the classification cutoff
ggplot(perf.df, aes(x = cut)) +
  geom_line(aes(y = sens, color = "sensitivity")) + 
  geom_line(aes(y = spec, color = "specificity"))

# cutoff maximizing sensitivity + specificity (Youden-style criterion)
perf@alpha.values[[1]][which.max(perf@x.values[[1]]+perf@y.values[[1]])] # cut
## [1] 0.02925047
max(perf@x.values[[1]]+perf@y.values[[1]]) # sensitivity + specificity
## [1] 1.687121
# confusion matrix
pred_train = predict(cv.train, newx = x_train, s=cv.train$lambda.1se, type="response")
# binarize at the .05 cutoff in one vectorized step; >= ensures that
# probabilities exactly at the cutoff are classified (the previous
# > / < assignment pair left p == .05 as a raw probability)
pred_train = ifelse(pred_train >= .05, 1, 0)
confusionMatrix(pred_train, y_train)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 16387   154
##          1   283   261
##                                           
##                Accuracy : 0.9744          
##                  95% CI : (0.9719, 0.9767)
##     No Information Rate : 0.9757          
##     P-Value [Acc > NIR] : 0.8679          
##                                           
##                   Kappa : 0.5314          
##  Mcnemar's Test P-Value : 0.0000000009179 
##                                           
##             Sensitivity : 0.9830          
##             Specificity : 0.6289          
##          Pos Pred Value : 0.9907          
##          Neg Pred Value : 0.4798          
##              Prevalence : 0.9757          
##          Detection Rate : 0.9591          
##    Detection Prevalence : 0.9682          
##       Balanced Accuracy : 0.8060          
##                                           
##        'Positive' Class : 0               
## 

testing

# subset predictors and criterion
# same column layout as the training set: 1:4 are ids + artifact
x_test = as.matrix(test.ml[,-c(1,2,3,4)])
y_test = as.double(as.matrix(test.ml[, 4]))

# test on holdout sample
pred_test = predict(cv.train, newx = x_test, s=cv.train$lambda.1se, type="response")
# >= so probabilities exactly at the .05 cutoff are classified (the previous
# > / < assignment pair left p == .05 untouched)
pred_test = ifelse(pred_test >= .05, 1, 0)

# confusion matrix
confusionMatrix(pred_test, y_test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5470   47
##          1   87   91
##                                           
##                Accuracy : 0.9765          
##                  95% CI : (0.9722, 0.9802)
##     No Information Rate : 0.9758          
##     P-Value [Acc > NIR] : 0.3862399       
##                                           
##                   Kappa : 0.564           
##  Mcnemar's Test P-Value : 0.0007542       
##                                           
##             Sensitivity : 0.9843          
##             Specificity : 0.6594          
##          Pos Pred Value : 0.9915          
##          Neg Pred Value : 0.5112          
##              Prevalence : 0.9758          
##          Detection Rate : 0.9605          
##    Detection Prevalence : 0.9687          
##       Balanced Accuracy : 0.8219          
##                                           
##        'Positive' Class : 0               
## 

svm

training

# recode the criterion as a two-level factor ("yes"/"no"), which caret
# requires for classProbs = TRUE
train.svm = train.ml[,-c(1,2,3)] %>%
  mutate(artifact = ifelse(artifact == 1, "yes","no"),
         artifact = as.factor(artifact))

# specify control parameters
# NOTE(review): metric = "ROC" below needs summaryFunction = twoClassSummary
# here; without it caret falls back to Accuracy for model selection -- confirm
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 3, classProbs = TRUE)

# run initial model
# linear-kernel SVM, 10-fold CV repeated 3 times, predictors centered/scaled
set.seed(101)
svmFit = train(artifact ~ ., 
               data = train.svm, 
               method = "svmLinear", 
               trControl = fitControl,
               preProcess = c("center", "scale"),
               tuneLength = 10,
               metric = "ROC",
               verbose = FALSE)
svmFit$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 619 
## 
## Objective Function Value : -609.3917 
## Training error : 0.015335 
## Probability model included.
# predict model
# keep only the "yes" class probability column
train_pred = predict(svmFit, newdata = train.svm, type="prob") %>%
  select(-no)

# plot cutoff v. accuracy
# ROCR evaluation of the SVM "yes" probabilities on the training set
predicted = prediction(train_pred, train.svm$artifact, label.ordering = NULL)
perf = performance(predicted, measure = "acc")
perf.df = data.frame(cut=perf@x.values[[1]],acc=perf@y.values[[1]])

ggplot(perf.df, aes(cut, acc)) +
  geom_line()

# plot false v. true positive rate
perf = performance(predicted, measure = "tpr", x.measure = "fpr")
perf.df = data.frame(cut=perf@alpha.values[[1]],fpr=perf@x.values[[1]],tpr=perf@y.values[[1]])

ggplot(perf.df, aes(fpr, tpr)) +
  geom_line()

# plot specificity v. sensitivity
perf = performance(predicted, measure = "sens", x.measure = "spec")
perf.df = data.frame(cut=perf@alpha.values[[1]],sens=perf@x.values[[1]],spec=perf@y.values[[1]])
ggplot(perf.df, aes(spec, sens)) +
  geom_line()

# sensitivity and specificity against the classification cutoff
ggplot(perf.df, aes(x = cut)) +
  geom_line(aes(y = sens, color = "sensitivity")) + 
  geom_line(aes(y = spec, color = "specificity"))

# cutoff maximizing sensitivity + specificity (Youden-style criterion)
perf@alpha.values[[1]][which.max(perf@x.values[[1]]+perf@y.values[[1]])] # cut
## [1] 0.0307682
max(perf@x.values[[1]]+perf@y.values[[1]]) # sensitivity + specificity
## [1] 1.693839
# cut and assess accuracy in training sample
train_pred = predict(svmFit, newdata = train.svm, type="prob") %>%
  select(-no)
train_pred =  as.matrix(train_pred)
# classify in one vectorized step. The previous pair of assignments
# (x[x > .09] = "yes"; x[x < .09] = "no") converted the matrix to character
# on the first assignment, making the second comparison lexicographic
# (wrong for probabilities printed in scientific notation), and left
# p == .09 unclassified entirely.
train_pred = ifelse(train_pred >= .09, "yes", "no")
confusionMatrix(train_pred, train.svm$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    no   yes
##        no  16577   172
##        yes    93   243
##                                               
##                Accuracy : 0.9845              
##                  95% CI : (0.9825, 0.9863)    
##     No Information Rate : 0.9757              
##     P-Value [Acc > NIR] : 0.000000000000001021
##                                               
##                   Kappa : 0.6393              
##  Mcnemar's Test P-Value : 0.000001655374433974
##                                               
##             Sensitivity : 0.9944              
##             Specificity : 0.5855              
##          Pos Pred Value : 0.9897              
##          Neg Pred Value : 0.7232              
##              Prevalence : 0.9757              
##          Detection Rate : 0.9703              
##    Detection Prevalence : 0.9803              
##       Balanced Accuracy : 0.7900              
##                                               
##        'Positive' Class : no                  
## 

testing

# recode the criterion as a two-level factor, mirroring train.svm
test.svm = test.ml[,-c(1,2,3)] %>%
  mutate(artifact = ifelse(artifact == 1, "yes","no"),
         artifact = as.factor(artifact))

# cut and assess accuracy in test sample
test_pred = predict(svmFit, newdata = test.svm, type="prob") %>%
  select(-no)
test_pred =  as.matrix(test_pred)
# one vectorized classification; avoids the previous > / < assignment pair,
# whose second comparison ran lexicographically on a character matrix and
# which left p == .09 unclassified
test_pred = ifelse(test_pred >= .09, "yes", "no")
confusionMatrix(test_pred, test.svm$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  5524   44
##        yes   33   94
##                                           
##                Accuracy : 0.9865          
##                  95% CI : (0.9831, 0.9893)
##     No Information Rate : 0.9758          
##     P-Value [Acc > NIR] : 0.000000007494  
##                                           
##                   Kappa : 0.7025          
##  Mcnemar's Test P-Value : 0.2545          
##                                           
##             Sensitivity : 0.9941          
##             Specificity : 0.6812          
##          Pos Pred Value : 0.9921          
##          Neg Pred Value : 0.7402          
##              Prevalence : 0.9758          
##          Detection Rate : 0.9700          
##    Detection Prevalence : 0.9777          
##       Balanced Accuracy : 0.8376          
##                                           
##        'Positive' Class : no              
## 

save models

# Save the fitted models using explicit paths instead of setwd(): changing
# the working directory is a global side effect that can break relative
# paths later in an interactive session.
modelDir = "~/Documents/code/dsnlab/automotion-test-set"
saveRDS(cv.train, file.path(modelDir, "model_log_FP.rds"))
saveRDS(svmFit, file.path(modelDir, "model_svm_FP.rds"))

weighted model

# create model weights (they sum to one)
# model_weights = ifelse(train.svm$artifact == "yes",
#                         (1/table(train.svm$artifact)[1]) * 0.5,
#                         (1/table(train.svm$artifact)[2]) * 0.5)
# 
# # use the same seed to ensure same cross-validation splits
# fitControl$seeds = svmFit$control$seeds
# 
# svmFit_weighted = train(artifact ~ .,
#                data = train.svm,
#                method = "svmLinear",
#                trControl = fitControl,
#                preProcess = c("center", "scale"),
#                tuneLength = 10,
#                metric = "ROC",
#                verbose = FALSE,
#                weights = model_weights)
# 
# svmFit_weighted$finalModel
# 
# test_pred_weighted = predict(svmFit_weighted, newdata = test.svm, type="prob") %>%
#   select(-no)
# test_pred_weighted =  as.matrix(test_pred_weighted)
# test_pred_weighted[test_pred_weighted > .07] = "yes"
# test_pred_weighted[test_pred_weighted < .07] = "no"
# confusionMatrix(test_pred_weighted, test.svm$artifact)

# # determine best cost parameter
# grid <- expand.grid(C = c(0, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 5))
# set.seed(3233)
# svm_Linear_Grid <- train(artifact ~ ., 
#                          data = train.svm, method = "svmLinear",
#                          trControl=fitControl,
#                          preProcess = c("center", "scale"),
#                          tuneGrid = grid,
#                          tuneLength = 10)
# svm_Linear_Grid
# plot(svm_Linear_Grid)
# test_pred_grid = predict(svm_Linear_Grid, newdata = test.svm)
# confusionMatrix(test_pred_grid, test.svm$artifact)

auto-motion process

training

# The rule-based ("auto-motion") trash-coding pipeline was duplicated
# verbatim for the training and test sets; define it once. For each volume it
# flags artifacts from (a) global mean/SD change beyond 2 SDs, (b) Euclidean
# motion derivatives beyond +/- .25mm, and (c) the striping signature in
# frequency tiles 1 and 10, then combines the flags and codes hits/misses/
# false alarms against the manual `artifact` codes.
code_trash = function(data) {
  data %>%
    gather(tile, freqtile_power, starts_with("tile")) %>%
    filter(tile %in% c("tile_1", "tile_10")) %>%

    # code trash based on mean, sd, and rp
    # thresholds are computed over the whole (ungrouped) dataset
    ungroup %>%
    mutate(meanDiff.mean = mean(Diff.mean, na.rm=TRUE),
           sdDiff.mean = sd(Diff.mean, na.rm=TRUE),
           meanDiff.sd = mean(Diff.sd, na.rm=TRUE),
           sdDiff.sd = sd(Diff.sd, na.rm=TRUE),

           # code volumes above mean thresholds as trash
           upper.mean = meanDiff.mean + 2*sdDiff.mean,
           lower.mean = meanDiff.mean - 2*sdDiff.mean,
           trash.mean = ifelse(Diff.mean > upper.mean | Diff.mean < lower.mean, 1, 0),

           upper.sd = meanDiff.sd + 2*sdDiff.sd,
           lower.sd = meanDiff.sd - 2*sdDiff.sd,
           trash.sd = ifelse(Diff.sd > upper.sd | Diff.sd < lower.sd, 1, 0),

           # code volumes with more than +/- .25mm translation or rotation in Euclidian distance
           trash.rp.tr = ifelse(euclidian_trans_deriv > .25 | euclidian_trans_deriv < -.25, 1, 0),
           trash.rp.rot = ifelse(euclidian_rot_deriv > .25 | euclidian_rot_deriv < -.25, 1, 0)) %>%
    select(-meanDiff.mean, -meanDiff.sd, -sdDiff.mean, -sdDiff.sd) %>%

    # code trash based on striping
    group_by(subjectID, run, tile) %>%
    mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
    ungroup() %>%
    select(-freqtile_power) %>%
    spread(tile,freqtile_power_c) %>%
    mutate(trash.stripe = ifelse(tile_1 < -.035 & tile_10 > .00025, 1, 0)) %>%

    # combine trash: striping alone marks a volume; otherwise at least two of
    # the mean/sd/rp flags are required
    mutate(trash.combined = ifelse(trash.stripe == 1, 1, 0),
           trash.sum = trash.rp.tr + trash.rp.rot + trash.mean + trash.sd,
           trash.combined = ifelse((trash.rp.tr + trash.rp.rot + trash.mean + trash.sd) > 1, 1, trash.combined)) %>%

    # recode as trash if volume behind and in front are both marked as trash
    # NOTE(review): lag/lead here run on ungrouped data, so these look across
    # subject/run boundaries at file seams -- confirm intended
    mutate(trash.combined = ifelse(trash.combined == 0 & lag(trash.combined) == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%

    # code first volume as trash if second volume is trash
    mutate(trash.combined = ifelse(volume == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%

    # code hits (hit = correctly flagged, neg = miss, pos = false positive,
    # NA = correct rejection)
    mutate(hits = ifelse(trash.combined == 1 & (artifact == 1), "hit",
                  ifelse(trash.combined == 0 & (artifact == 1), "neg",
                  ifelse(trash.combined == 1 & (artifact == 0), "pos", NA))),
           # regexpr('.*', hits) is NA only when hits is NA, so label is the
           # volume number for coded volumes and NA otherwise
           label = ifelse(regexpr('.*', hits), as.character(volume), ''),
           hits = as.factor(hits)) %>%
    gather(tile, freqtile_power_c, c("tile_1", "tile_10"))
}

# training
train.man = code_trash(training)

# testing
test.man = code_trash(testing)

compare hit rates

training data

# Keep one row per volume (tile_1 only) and add a correct-rejection label
# for volumes that were neither flagged by the pipeline nor hand-coded.
train.tab <- train.man %>%
  filter(tile == "tile_1") %>%
  mutate(hits.tot = case_when(
    hits %in% "hit" ~ "hit",
    hits %in% "neg" ~ "neg",
    hits %in% "pos" ~ "pos",
    is.na(hits) ~ "cor.rej"
  ))

lasso logistic regression

# training-set performance of the lasso model; compute the confusion matrix
# once instead of three times
cm.log.train = confusionMatrix(pred_train, y_train)
cm.log.train$table
##           Reference
## Prediction     0     1
##          0 16387   154
##          1   283   261
cm.log.train$overall[1]
## Accuracy 
## 0.974422
cm.log.train$byClass[11]  # Balanced Accuracy
## Balanced Accuracy 
##         0.8059695

svm

# training-set performance of the SVM; compute the confusion matrix once
# instead of three times
cm.svm.train = confusionMatrix(train_pred, train.svm$artifact)
cm.svm.train$table
##           Reference
## Prediction    no   yes
##        no  16577   172
##        yes    93   243
cm.svm.train$overall[1]
##  Accuracy 
## 0.9844893
cm.svm.train$byClass[11]  # Balanced Accuracy
## Balanced Accuracy 
##         0.7899816

manual

table(train.tab$hits.tot)
## 
## cor.rej     hit     neg     pos 
##   16614     266     148      57
# index the table by category name rather than position, so a missing or
# reordered category errors loudly instead of silently picking the wrong cell
tab.train = table(train.tab$hits.tot)
cor.rej = tab.train[["cor.rej"]]
hit = tab.train[["hit"]]
neg = tab.train[["neg"]]
pos = tab.train[["pos"]]
total = cor.rej + hit + neg + pos

# overall accuracy: correct rejections + hits over all volumes
sprintf('Accuracy: %s', round((cor.rej + hit) / total,2))
## [1] "Accuracy: 0.99"
# balanced accuracy
sprintf('Balanced accuracy: %s', round(((cor.rej / (cor.rej + pos)) + (hit / (hit + neg))) / 2,2))
## [1] "Balanced accuracy: 0.82"

test data

# One row per volume (tile_1 only); volumes with no hit/miss/false-alarm
# code and no manual flag become correct rejections.
test.tab <- test.man %>%
  filter(tile == "tile_1") %>%
  mutate(hits.tot = case_when(
    hits %in% "hit" ~ "hit",
    hits %in% "neg" ~ "neg",
    hits %in% "pos" ~ "pos",
    is.na(hits) ~ "cor.rej"
  ))

lasso logistic regression

# holdout-set performance of the lasso model; compute the confusion matrix
# once instead of three times
cm.log.test = confusionMatrix(pred_test, y_test)
cm.log.test$table
##           Reference
## Prediction    0    1
##          0 5470   47
##          1   87   91
cm.log.test$overall[1]
##  Accuracy 
## 0.9764706
cm.log.test$byClass[11]  # Balanced Accuracy
## Balanced Accuracy 
##         0.8218822

svm

# holdout-set performance of the SVM; compute the confusion matrix once
# instead of three times
cm.svm.test = confusionMatrix(test_pred, test.svm$artifact)
cm.svm.test$table
##           Reference
## Prediction   no  yes
##        no  5524   44
##        yes   33   94
cm.svm.test$overall[1]
##  Accuracy 
## 0.9864794
cm.svm.test$byClass[11]  # Balanced Accuracy
## Balanced Accuracy 
##         0.8376105

manual

table(test.tab$hits.tot)
## 
## cor.rej     hit     neg     pos 
##    5532      93      45      25
# index by category name rather than position, so a missing or reordered
# category errors loudly instead of silently picking the wrong cell
tab.test = table(test.tab$hits.tot)
cor.rej = tab.test[["cor.rej"]]
hit = tab.test[["hit"]]
neg = tab.test[["neg"]]
pos = tab.test[["pos"]]
total = cor.rej + hit + neg + pos

# overall accuracy: correct rejections + hits over all volumes
sprintf('Accuracy: %s', round((cor.rej + hit) / total,2))
## [1] "Accuracy: 0.99"
# balanced accuracy
sprintf('Balanced accuracy: %s', round(((cor.rej / (cor.rej + pos)) + (hit / (hit + neg))) / 2,2))
## [1] "Balanced accuracy: 0.83"

plot composite

combine training and test data

# Stack the rule-coded training and test sets back into one full-sample frame
full_man = bind_rows(train.man, test.man)

logistic regression

full_log = bind_rows(train.ml,test.ml)

# re-run lasso logistic regression on full sample
x = as.matrix(full_log[,-c(1,2,3,4)])
y = as.double(as.matrix(full_log[, 4]))
pred = predict(cv.train, newx = x, s=cv.train$lambda.1se, type="response")
# binarize at the .05 cutoff; >= so probabilities exactly at the cutoff are
# classified (the previous > / < assignment pair left p == .05 untouched)
pred = ifelse(pred >= .05, 1, 0)
confusionMatrix(pred, y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 21857   201
##          1   370   352
##                                            
##                Accuracy : 0.9749           
##                  95% CI : (0.9728, 0.9769) 
##     No Information Rate : 0.9757           
##     P-Value [Acc > NIR] : 0.7878           
##                                            
##                   Kappa : 0.5395           
##  Mcnemar's Test P-Value : 0.000000000002057
##                                            
##             Sensitivity : 0.9834           
##             Specificity : 0.6365           
##          Pos Pred Value : 0.9909           
##          Neg Pred Value : 0.4875           
##              Prevalence : 0.9757           
##          Detection Rate : 0.9595           
##    Detection Prevalence : 0.9683           
##       Balanced Accuracy : 0.8099           
##                                            
##        'Positive' Class : 0                
## 

svm

full_svm = bind_rows(train.ml,test.ml)  %>%
  mutate(artifact = ifelse(artifact == 1, "yes","no"),
         artifact = as.factor(artifact))

# re-run svm on full sample
full_pred = predict(svmFit, newdata = full_svm, type="prob") %>%
  select(-no)
full_pred =  as.matrix(full_pred)
# one vectorized classification; avoids the previous > / < assignment pair,
# whose second comparison ran lexicographically on a character matrix and
# which left p == .09 unclassified
full_pred = ifelse(full_pred >= .09, "yes", "no")
confusionMatrix(full_pred, full_svm$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    no   yes
##        no  22101   216
##        yes   126   337
##                                                
##                Accuracy : 0.985                
##                  95% CI : (0.9833, 0.9865)     
##     No Information Rate : 0.9757               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.6558               
##  Mcnemar's Test P-Value : 0.00000149           
##                                                
##             Sensitivity : 0.9943               
##             Specificity : 0.6094               
##          Pos Pred Value : 0.9903               
##          Neg Pred Value : 0.7279               
##              Prevalence : 0.9757               
##          Detection Rate : 0.9702               
##    Detection Prevalence : 0.9797               
##       Balanced Accuracy : 0.8019               
##                                                
##        'Positive' Class : no                   
## 

manual

# Rule-based pipeline performance on the full sample; tile_1 keeps one row
# per volume (each volume appears once per tile in full_man)
full_man1 = full_man %>%
  filter(tile == "tile_1")
# NOTE(review): both arguments are numeric 0/1 vectors; newer caret versions
# require factors here -- confirm the installed version accepts numerics
confusionMatrix(full_man1$trash.combined, full_man1$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 22043   193
##          1    82   359
##                                                
##                Accuracy : 0.9879               
##                  95% CI : (0.9864, 0.9893)     
##     No Information Rate : 0.9757               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.7169               
##  Mcnemar's Test P-Value : 0.00000000003284     
##                                                
##             Sensitivity : 0.9963               
##             Specificity : 0.6504               
##          Pos Pred Value : 0.9913               
##          Neg Pred Value : 0.8141               
##              Prevalence : 0.9757               
##          Detection Rate : 0.9720               
##    Detection Prevalence : 0.9806               
##       Balanced Accuracy : 0.8233               
##                                                
##        'Positive' Class : 0                    
## 

plot and compare models

# Attach the lasso predictions to the full sample and code hit/false-alarm/
# miss labels (hit = correctly flagged, pos = false positive, neg = miss,
# NA = correct rejection); `1` is the prediction column name from glmnet
data.plot.log = bind_cols(full_log, as.data.frame(y), as.data.frame(pred)) %>%
  mutate(hits = ifelse(y == 1 & `1` == 1, "hit",
                ifelse(y == 0 & `1` == 1, "pos",
                ifelse(y == 1 & `1` == 0, "neg", NA))),
         # NOTE(review): regexpr('.*', hits) is NA only when hits is NA, so
         # label is the volume number for coded volumes; !is.na(hits) would
         # express this directly
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits))

# Same labeling for the SVM predictions ("yes"/"no" instead of 1/0)
data.plot.svm = bind_cols(full_svm, as.data.frame(full_pred)) %>%
  rename("full_pred" = yes) %>%
  mutate(hits = ifelse(artifact == "yes" & full_pred == "yes", "hit",
                ifelse(artifact == "no" & full_pred == "yes", "pos",
                ifelse(artifact == "yes" & full_pred == "no", "neg", NA))),
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits))

# One long frame with a hits column per model (auto = rule-based pipeline,
# log = lasso, svm = SVM), joined on subject/run/volume
plot.comp = full_man %>%
  filter(tile == "tile_1") %>%
  select(subjectID, run, volume, euclidian_trans_deriv, hits) %>%
  rename("auto" = hits) %>%
  left_join(data.plot.log, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
  select(subjectID, run, volume, euclidian_trans_deriv, auto, hits) %>%
  rename("log" = hits) %>%
  left_join(data.plot.svm, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
  select(subjectID, run, volume, euclidian_trans_deriv, auto, log, hits) %>%
  rename("svm" = hits) %>%
  gather(model, hits, c("auto", "log", "svm")) %>%
  mutate(label = ifelse(regexpr('.*', hits), as.character(volume), ''))

# Motion trace per subject/run, faceted by model, with coded volumes
# highlighted and labeled by volume number
ggplot(plot.comp, aes(x = volume, y = euclidian_trans_deriv)) +
  geom_line(size = .25) +
  geom_point(data = subset(plot.comp, !is.na(hits)), aes(color = hits), size = 2.5) +
  geom_text(aes(label = label), size = 1.5) +
  facet_wrap(~ subjectID + run + model, ncol = 9, scales = "free") +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A","#F21A00")) +
  theme(axis.text.x = element_text(size = 6))